{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy.sparse as sp\n",
    "import pandas as pd\n",
    "#from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import log_loss\n",
    "import lightgbm as lgbm\n",
    "from lightgbm.sklearn import LGBMClassifier\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dpath = './data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df  = pd.read_csv(dpath+\"train.csv\", nrows=50000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_ROUNDS = 10000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 50000 entries, 0 to 49999\n",
      "Data columns (total 24 columns):\n",
      "id                  50000 non-null uint64\n",
      "click               50000 non-null int64\n",
      "hour                50000 non-null int64\n",
      "C1                  50000 non-null int64\n",
      "banner_pos          50000 non-null int64\n",
      "site_id             50000 non-null object\n",
      "site_domain         50000 non-null object\n",
      "site_category       50000 non-null object\n",
      "app_id              50000 non-null object\n",
      "app_domain          50000 non-null object\n",
      "app_category        50000 non-null object\n",
      "device_id           50000 non-null object\n",
      "device_ip           50000 non-null object\n",
      "device_model        50000 non-null object\n",
      "device_type         50000 non-null int64\n",
      "device_conn_type    50000 non-null int64\n",
      "C14                 50000 non-null int64\n",
      "C15                 50000 non-null int64\n",
      "C16                 50000 non-null int64\n",
      "C17                 50000 non-null int64\n",
      "C18                 50000 non-null int64\n",
      "C19                 50000 non-null int64\n",
      "C20                 50000 non-null int64\n",
      "C21                 50000 non-null int64\n",
      "dtypes: int64(14), object(9), uint64(1)\n",
      "memory usage: 9.2+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "str_col=[\"site_id\",\"site_domain\",\"site_category\",\"app_id\", \n",
    "    \"app_domain\",\"app_category\",\"device_id\",\"device_ip\",\"device_model\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X=df.drop(['id','click'],axis=1)\n",
    "y=df['click']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "le=LabelEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in str_col:\n",
    "    X[col]=le.fit_transform(X[col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hour</th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>site_id</th>\n",
       "      <th>site_domain</th>\n",
       "      <th>site_category</th>\n",
       "      <th>app_id</th>\n",
       "      <th>app_domain</th>\n",
       "      <th>app_category</th>\n",
       "      <th>device_id</th>\n",
       "      <th>...</th>\n",
       "      <th>device_type</th>\n",
       "      <th>device_conn_type</th>\n",
       "      <th>C14</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>569</td>\n",
       "      <td>2</td>\n",
       "      <td>535</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>2858</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>15706</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>569</td>\n",
       "      <td>2</td>\n",
       "      <td>535</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>2858</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15704</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>569</td>\n",
       "      <td>2</td>\n",
       "      <td>535</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>2858</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15704</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>569</td>\n",
       "      <td>2</td>\n",
       "      <td>535</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>2858</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15706</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>1</td>\n",
       "      <td>684</td>\n",
       "      <td>338</td>\n",
       "      <td>0</td>\n",
       "      <td>535</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>2858</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>18993</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2161</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>157</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       hour    C1  banner_pos  site_id  site_domain  site_category  app_id  \\\n",
       "0  14102100  1005           0       81          569              2     535   \n",
       "1  14102100  1005           0       81          569              2     535   \n",
       "2  14102100  1005           0       81          569              2     535   \n",
       "3  14102100  1005           0       81          569              2     535   \n",
       "4  14102100  1005           1      684          338              0     535   \n",
       "\n",
       "   app_domain  app_category  device_id ...   device_type  device_conn_type  \\\n",
       "0          19             0       2858 ...             1                 2   \n",
       "1          19             0       2858 ...             1                 0   \n",
       "2          19             0       2858 ...             1                 0   \n",
       "3          19             0       2858 ...             1                 0   \n",
       "4          19             0       2858 ...             1                 0   \n",
       "\n",
       "     C14  C15  C16   C17  C18  C19     C20  C21  \n",
       "0  15706  320   50  1722    0   35      -1   79  \n",
       "1  15704  320   50  1722    0   35  100084   79  \n",
       "2  15704  320   50  1722    0   35  100084   79  \n",
       "3  15706  320   50  1722    0   35  100084   79  \n",
       "4  18993  320   50  2161    0   35      -1  157  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_n_estimators(params , X_train , y_train , early_stopping_rounds=100):\n",
    "    lgbm_params = params.copy()\n",
    "     \n",
    "    lgbmtrain = lgbm.Dataset(X_train , y_train )\n",
    "     \n",
    "    #num_boost_round为弱分类器数目，下面的代码参数里因为已经设置了early_stopping_rounds\n",
    "    #即性能未提升的次数超过过早停止设置的数值，则停止训练\n",
    "    cv_result = lgbm.cv(lgbm_params , lgbmtrain , num_boost_round=MAX_ROUNDS , nfold=5,  metrics='binary_logloss' , early_stopping_rounds=early_stopping_rounds,seed=3 )\n",
    "     \n",
    "    print('best n_estimators:' , len(cv_result['binary_logloss-mean']))\n",
    "    print('best cv score:' , cv_result['binary_logloss-mean'][-1])\n",
    "     \n",
    "    return len(cv_result['binary_logloss-mean'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 50000 entries, 0 to 49999\n",
      "Data columns (total 22 columns):\n",
      "hour                50000 non-null int64\n",
      "C1                  50000 non-null int64\n",
      "banner_pos          50000 non-null int64\n",
      "site_id             50000 non-null int64\n",
      "site_domain         50000 non-null int64\n",
      "site_category       50000 non-null int64\n",
      "app_id              50000 non-null int64\n",
      "app_domain          50000 non-null int64\n",
      "app_category        50000 non-null int64\n",
      "device_id           50000 non-null int64\n",
      "device_ip           50000 non-null int64\n",
      "device_model        50000 non-null int64\n",
      "device_type         50000 non-null int64\n",
      "device_conn_type    50000 non-null int64\n",
      "C14                 50000 non-null int64\n",
      "C15                 50000 non-null int64\n",
      "C16                 50000 non-null int64\n",
      "C17                 50000 non-null int64\n",
      "C18                 50000 non-null int64\n",
      "C19                 50000 non-null int64\n",
      "C20                 50000 non-null int64\n",
      "C21                 50000 non-null int64\n",
      "dtypes: int64(22)\n",
      "memory usage: 8.4 MB\n"
     ]
    }
   ],
   "source": [
    "X.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sika/.local/lib/python3.6/site-packages/lightgbm/basic.py:731: UserWarning: categorical_feature keyword has been found in `params` and will be ignored.\n",
      "Please use categorical_feature argument of the Dataset constructor to pass this parameter.\n",
      "  .format(key))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "best n_estimators: 2\n",
      "best cv score: 0.4520524330521171\n"
     ]
    }
   ],
   "source": [
    "params = {'boosting_type': 'gbdt',\n",
    "          'objective': 'binary',\n",
    "          'is_unbalance':True,\n",
    "          'categorical_feature': [14,15,16,17,18,19,20,21],\n",
    "          'n_jobs': 4,\n",
    "          'learning_rate': 0.1,\n",
    "          #'n_estimators':n_estimators_1,\n",
    "          'num_leaves': 60,\n",
    "          'max_depth': 10,\n",
    "          'colsample_bytree': 0.7,\n",
    "          'verbosity':5\n",
    "         }\n",
    "n_estimators_1 = get_n_estimators(params, X , y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 4 candidates, totalling 20 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   10.4s\n",
      "[Parallel(n_jobs=4)]: Done  18 out of  20 | elapsed:   12.0s remaining:    1.3s\n",
      "[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:   12.2s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=3, shuffle=True),\n",
       "       error_score='raise-deprecating',\n",
       "       estimator=LGBMClassifier(boosting_type='gbdt',\n",
       "        categorical_feature=[14, 15, 16, 17, 18, 19, 20, 21],\n",
       "        class_weight=None, colsample_bytree=0.7, importance_type='split',\n",
       "        is_unbalance=True, learning_rate=0.1, max_depth=6,\n",
       "        min_child_samples=20, min_child_weight=0.001, min_....0, reg_lambda=0.0, silent=False,\n",
       "        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),\n",
       "       fit_params=None, iid='warn', n_jobs=4,\n",
       "       param_grid={'num_leaves': range(50, 90, 10)},\n",
       "       pre_dispatch='2*n_jobs', refit=False, return_train_score='warn',\n",
       "       scoring='neg_log_loss', verbose=5)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "params = {'boosting_type': 'gbdt',\n",
    "          'objective': 'binary',\n",
    "          'is_unbalance':True,\n",
    "          'categorical_feature': [14,15,16,17,18,19,20,21],\n",
    "          'n_jobs': 4,\n",
    "          'learning_rate': 0.1,\n",
    "          'n_estimators':n_estimators_1,\n",
    "          #'num_leaves': 100,\n",
    "          'max_depth': 6,\n",
    "          'colsample_bytree': 0.7,\n",
    "         }\n",
    "lg = LGBMClassifier(silent=False,  **params)\n",
    "num_leaves_s = range(50,90,10) #50,60,70,80\n",
    "tuned_parameters = dict( num_leaves = num_leaves_s)\n",
    "\n",
    "grid_search = GridSearchCV(lg, n_jobs=4, param_grid=tuned_parameters, cv = kfold, scoring=\"neg_log_loss\", verbose=5, refit = False)\n",
    "grid_search.fit(X , y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-0.4538230720127494\n",
      "{'num_leaves': 60}\n"
     ]
    }
   ],
   "source": [
    "print(grid_search.best_score_)\n",
    "print(grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 4 candidates, totalling 20 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    1.9s\n",
      "[Parallel(n_jobs=4)]: Done  18 out of  20 | elapsed:    3.2s remaining:    0.4s\n",
      "[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    3.4s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=3, shuffle=True),\n",
       "       error_score='raise-deprecating',\n",
       "       estimator=LGBMClassifier(boosting_type='gbdt',\n",
       "        categorical_feature=[14, 15, 16, 17, 18, 19, 20, 21],\n",
       "        class_weight=None, colsample_bytree=0.7, importance_type='split',\n",
       "        is_unbalance=True, learning_rate=0.1, max_depth=6,\n",
       "        min_child_samples=20, min_child_weight=0.001, min_....0, reg_lambda=0.0, silent=False,\n",
       "        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),\n",
       "       fit_params=None, iid='warn', n_jobs=4,\n",
       "       param_grid={'min_child_samples': range(10, 50, 10)},\n",
       "       pre_dispatch='2*n_jobs', refit=False, return_train_score='warn',\n",
       "       scoring='neg_log_loss', verbose=5)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "params = {'boosting_type': 'gbdt',\n",
    "          'objective': 'binary',\n",
    "          'is_unbalance':True,\n",
    "          'categorical_feature': [14,15,16,17,18,19,20,21],\n",
    "          'n_jobs': 4,\n",
    "          'learning_rate': 0.1,\n",
    "          'n_estimators':n_estimators_1,\n",
    "          'num_leaves': 60,\n",
    "          'max_depth': 6,\n",
    "          'colsample_bytree': 0.7,\n",
    "         }\n",
    "lg = LGBMClassifier(silent=False,  **params)\n",
    "\n",
    "min_child_samples_s = range(10,50,10) \n",
    "tuned_parameters = dict( min_child_samples = min_child_samples_s)\n",
    "\n",
    "grid_search = GridSearchCV(lg, n_jobs=4,  param_grid=tuned_parameters, cv = kfold, scoring=\"neg_log_loss\", verbose=5, refit = False)\n",
    "grid_search.fit(X , y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-0.45377342808969356\n",
      "{'min_child_samples': 10}\n"
     ]
    }
   ],
   "source": [
    "print(grid_search.best_score_)\n",
    "print(grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 5 candidates, totalling 25 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.\n",
      "[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    1.8s\n",
      "[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    4.4s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=3, shuffle=True),\n",
       "       error_score='raise-deprecating',\n",
       "       estimator=LGBMClassifier(boosting_type='gbdt',\n",
       "        categorical_feature=[14, 15, 16, 17, 18, 19, 20, 21],\n",
       "        class_weight=None, colsample_bytree=1.0, importance_type='split',\n",
       "        is_unbalance=True, learning_rate=0.1, max_depth=6,\n",
       "        min_child_samples=10, min_child_weight=0.001, min_....0, reg_lambda=0.0, silent=False,\n",
       "        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),\n",
       "       fit_params=None, iid='warn', n_jobs=4,\n",
       "       param_grid={'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9]},\n",
       "       pre_dispatch='2*n_jobs', refit=False, return_train_score='warn',\n",
       "       scoring='neg_log_loss', verbose=5)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "params = {'boosting_type': 'gbdt',\n",
    "          'objective': 'binary',\n",
    "          'is_unbalance':True,\n",
    "          'categorical_feature': [14,15,16,17,18,19,20,21],\n",
    "          'n_jobs': 4,\n",
    "          'learning_rate': 0.1,\n",
    "          'n_estimators':n_estimators_1,\n",
    "          'num_leaves': 60,\n",
    "          'max_depth': 6,\n",
    "          'min_child_samples':10\n",
    "          #'colsample_bytree': 0.7,\n",
    "         }\n",
    "lg = LGBMClassifier(silent=False,  **params)\n",
    "\n",
    "colsample_bytree_s = [i/10.0 for i in range(5,10)]\n",
    "tuned_parameters = dict( colsample_bytree = colsample_bytree_s)\n",
    "\n",
    "grid_search = GridSearchCV(lg, n_jobs=4,  param_grid=tuned_parameters, cv = kfold, scoring=\"neg_log_loss\", verbose=5, refit = False)\n",
    "grid_search.fit(X , y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-0.4534446227834011\n",
      "{'colsample_bytree': 0.8}\n"
     ]
    }
   ],
   "source": [
    "print(grid_search.best_score_)\n",
    "print(grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sika/.local/lib/python3.6/site-packages/lightgbm/basic.py:731: UserWarning: categorical_feature keyword has been found in `params` and will be ignored.\n",
      "Please use categorical_feature argument of the Dataset constructor to pass this parameter.\n",
      "  .format(key))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "best n_estimators: 18\n",
      "best cv score: 0.45357246814488217\n"
     ]
    }
   ],
   "source": [
    "params = {'boosting_type': 'gbdt',\n",
    "          'objective': 'binary',\n",
    "          'is_unbalance':True,\n",
    "          'categorical_feature':[14,15,16,17,18,19,20,21],\n",
    "          'n_jobs': 4,\n",
    "          'learning_rate': 0.01,\n",
    "          #'n_estimators':n_estimators_1,\n",
    "          'num_leaves': 60,\n",
    "          'max_depth': 6,\n",
    "          'min_child_samples':10,\n",
    "          'colsample_bytree': 0.8\n",
    "         }\n",
    "n_estimators_2 = get_n_estimators(params , X , y, early_stopping_rounds=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sika/.local/lib/python3.6/site-packages/lightgbm/basic.py:731: UserWarning: categorical_feature keyword has been found in `params` and will be ignored.\n",
      "Please use categorical_feature argument of the Dataset constructor to pass this parameter.\n",
      "  .format(key))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LGBMClassifier(boosting_type='gbdt',\n",
       "        categorical_feature=[14, 15, 16, 17, 18, 19, 20, 21],\n",
       "        class_weight=None, colsample_bytree=0.8, importance_type='split',\n",
       "        is_unbalance=True, learning_rate=0.01, max_depth=6,\n",
       "        metric='logloss', min_child_samples=10, min_child_weight=0.001,\n",
       "        min_split_gain=0.0, n_estimators=2, n_jobs=4, num_leaves=60,\n",
       "        objective='binary', random_state=None, reg_alpha=0.0,\n",
       "        reg_lambda=0.0, silent=False, subsample=1.0,\n",
       "        subsample_for_bin=200000, subsample_freq=0)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "params = {'boosting_type': 'gbdt',\n",
    "          'objective': 'binary',\n",
    "          'metric':'logloss',\n",
    "          'is_unbalance':True,\n",
    "          'categorical_feature': [14,15,16,17,18,19,20,21],\n",
    "          'n_jobs': 4,\n",
    "          'learning_rate': 0.01,\n",
    "          'n_estimators':n_estimators_1,\n",
    "          'num_leaves': 60,\n",
    "          'max_depth': 6,\n",
    "          'min_child_samples':10,\n",
    "          'colsample_bytree': 0.8\n",
    "         }\n",
    "\n",
    "lg = LGBMClassifier(silent=False,  **params)\n",
    "lg.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "pickle.dump(lg, open(\"ctr_lightgbm.pkl\", 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
